{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import ray\n",
    "import time\n",
    "import pickle\n",
    "from pathlib import Path\n",
    "from tqdm import tqdm\n",
    "import json\n",
    "import numpy as np\n",
    "from matplotlib import pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2020-11-21 11:18:40,680\tINFO services.py:1092 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://169.229.48.125:8999\u001b[39m\u001b[22m\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'node_ip_address': '169.229.48.125',\n",
       " 'raylet_ip_address': '169.229.48.125',\n",
       " 'redis_address': '169.229.48.125:6379',\n",
       " 'object_store_address': '/tmp/ray/session_2020-11-21_11-18-40_166968_128289/sockets/plasma_store',\n",
       " 'raylet_socket_name': '/tmp/ray/session_2020-11-21_11-18-40_166968_128289/sockets/raylet',\n",
       " 'webui_url': '169.229.48.125:8999',\n",
       " 'session_dir': '/tmp/ray/session_2020-11-21_11-18-40_166968_128289',\n",
       " 'metrics_export_port': 65021,\n",
       " 'node_id': '1972d1bea4005ceca5bf563f73ac9bd92158f125'}"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ray.shutdown()\n",
    "ray.init(dashboard_host='0.0.0.0', dashboard_port=8999)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 444/444 [00:00<00:00, 1297.09it/s]\n",
      "100%|██████████| 442/442 [00:00<00:00, 91185.01it/s]\n"
     ]
    }
   ],
   "source": [
    "challenge_list_path = Path('../data/hackerrank/algorithms_challenge_index.json')\n",
    "base_path = Path('../data/hackerrank/')\n",
    "dirs = [(x / 'challenge_data.json') for x in tqdm(list(base_path.iterdir())) if x.is_dir() and (x / 'challenge_data.json').exists()]\n",
    "\n",
    "with challenge_list_path.open('r') as f:\n",
    "    challenge_list = json.load(f)\n",
    "extract_fields = ['slug', 'max_score', 'success_ratio', 'preview', 'difficulty_name', 'tag_names']\n",
    "challenge_list = [{k: v for k, v in challenge.items() if k in extract_fields} for challenge in tqdm(challenge_list)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading data: 100%|██████████| 437/437 [00:00<00:00, 467.45it/s]\n"
     ]
    }
   ],
   "source": [
    "data = {}\n",
    "for index in tqdm(dirs, desc=\"Loading data\"):\n",
    "    challenge = index.parent.name\n",
    "    with index.open('r') as f:\n",
    "        data[challenge] = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 437/437 [38:16<00:00,  5.26s/it] \n"
     ]
    }
   ],
   "source": [
    "def fn(tup):\n",
    "    import editdistance\n",
    "    import numpy as np\n",
    "    challenge, submissions = tup\n",
    "    if submissions is None:\n",
    "        return challenge, None\n",
    "    N = len(submissions)\n",
    "    dist_mat = np.zeros((N, N))\n",
    "    for i in range(N):\n",
    "        dist_mat[i, i] = 0\n",
    "        for j in range(i + 1, N):\n",
    "            a = submissions[i]['src']\n",
    "            b = submissions[j]['src']\n",
    "            if a is not None and b is not None:\n",
    "                dist = float(editdistance.distance(a, b)) / max(len(a), len(b))\n",
    "                dist_mat[i][j] = dist\n",
    "                dist_mat[j][i] = dist\n",
    "    return challenge, dist_mat\n",
    "\n",
    "data_iter = ray.util.iter.from_items(list(data.items()), num_shards=36)\n",
    "vecs = data_iter.for_each(fn, max_concurrency=4)\n",
    "\n",
    "distance_map = {}\n",
    "for challenge, distance_matrix in tqdm(vecs.gather_async(batch_ms=1000), total=len(data)):\n",
    "    distance_map[challenge] = distance_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 442/442 [00:00<00:00, 4552.20it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "easy 117\n",
      "medium 134\n",
      "hard 91\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "difficulty_mapping = dict(easy='easy', medium='medium', hard='hard', advanced='hard', expert='hard')\n",
    "challenge_by_difficulty = dict(easy={}, medium={}, hard={})\n",
    "for challenge in tqdm(challenge_list):\n",
    "    key = challenge['slug']\n",
    "    if key in distance_map:\n",
    "        x = distance_map[key]\n",
    "        norms = np.sum(np.abs(x)**2,axis=-1)**(1./2)\n",
    "        idxs = np.flip(norms.argsort())\n",
    "        correct_idxs = [idx for idx in idxs if data[key][idx]['score'] >= challenge['max_score']]\n",
    "        top_correct_idxs = correct_idxs[:10]\n",
    "        if len(x) > 0 and len(top_correct_idxs) > 0:\n",
    "            out_dict = challenge.copy()\n",
    "            out_dict['srcs'] = [data[key][idx]['src'] for idx in top_correct_idxs]\n",
    "            out_dict['hacker_ids'] = [data[key][idx]['hacker_id'] for idx in top_correct_idxs]\n",
    "            out_dict['dissimilarity_matrix'] = np.zeros((len(top_correct_idxs), len(top_correct_idxs))).tolist()\n",
    "            for x1, i in enumerate(top_correct_idxs):\n",
    "                for y1, j in enumerate(top_correct_idxs):\n",
    "                    out_dict['dissimilarity_matrix'][x1][y1] = x[i][j]\n",
    "            challenge_by_difficulty[difficulty_mapping.get(challenge['difficulty_name'].lower())][key] = out_dict\n",
    "\n",
    "for k, v in challenge_by_difficulty.items():\n",
    "    print(k, len(v))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "with Path('../data/hackerrank/full_data.json').open('w') as f:\n",
    "    json.dump(challenge_by_difficulty, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
